Progetto

Trasformazione del cinema e
servizi di streaming

Mejric Maroan

Introduzione

library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
library(magrittr)
library(ggthemes)
library(reshape2)
library(stringr)
library(readr)
library(stringi)

# Read the raw attack records, drop the columns listed below, and keep only
# the rows whose Year is not missing.
shark <- read.csv("attacks.csv") %>%
  select(-c(pdf, href.formula, href, Case.Number.1, Case.Number.2, original.order, X, X.1)) %>%
  filter(!is.na(Year))

# Negated `%in%`: TRUE for every element of `x` that does not occur in `y`.
`%!in%` <- function(x, y) {
  !(x %in% y)
}
# Assign 16 readable column names to `shark`, matched by position.
names(shark) <- c(
  "Case Number", "Date", "Year", "Type",
  "Country", "Area", "Location", "Activity",
  "Name", "Sex", "Age", "Injury",
  "Fatal_Y_N", "Time", "Species", "Investigator or Source"
)

# Working copy of the data without the rows whose year is coded as 0.
temp <- shark %>% filter(Year != 0)

# Non-empty Area values, sorted, as a one-column tibble (column `value`).
# Built with tibble() because tbl_df() is deprecated.
Tempy <- tibble(value = shark$Area) %>%
  filter(value != "") %>%
  arrange(value)

# Keep only the areas whose text contains a digit; `Match` holds the first
# digit found in the field.
Temp1 <- Tempy %>%
  mutate(Match = str_extract(value, "\\d")) %>%
  filter(!is.na(Match))

# Drop the rows whose Area is one of those digit-containing values.
temp <- temp %>% filter(Area %!in% Temp1$value)

NUMBER OF SHARK ATTACKS VS. YEAR (ORDERED BY YEAR)

# Bar chart of the number of attacks per year, for years 1980 to 2017.
Year_Shark_Year <- temp %>%
  group_by(Year) %>%
  count() %>% # table with columns (Year, n = number of attacks)
  filter(Year %!in% c(5, 77, 500)) %>%
  # Element-wise `&`, not scalar `&&`: the condition is evaluated on a column.
  # `&&` only worked while every group held exactly one row, and it raises an
  # error for longer inputs in R >= 4.3.
  filter(Year > 1979 & Year < 2018) %>%
  ggplot() +
  geom_bar(mapping = aes(reorder(Year, Year), y = n, fill = n), stat = "identity") +
  scale_fill_gradient("Attacchi", low = "yellow", high = "red") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Numero attacchi di squalo per anno", x = "Anno", y = "Numero di attacchi", fill = "Attacchi")
ggplotly(Year_Shark_Year)

Number of Shark Attacks vs. Country

# Horizontal bar chart of attack counts for the countries with more than 58
# recorded attacks, ordered by count.
Shark_Year <- temp %>%
  group_by(Country) %>%
  count() %>%
  filter(n > 58) %>%
  ggplot() +
  geom_bar(mapping = aes(reorder(Country, n), y = n, fill = n), stat = "identity") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  scale_fill_gradient("Attacchi", low = "orange", high = "red") +
  # Legend label spelled consistently with the scale name ("Attacchi").
  labs(title = "Numero attacchi di squalo per paese", x = "Paese", y = "Numero di attacchi", fill = "Attacchi") +
  coord_flip()
ggplotly(Shark_Year)

FATALITIES

# Outcome codes that count as a known result (fatal / not fatal).
Yes_No <- c("Y", "N")

# Number of attacks per outcome, keeping only the Y and N groups.
Fatal <- temp %>%
  group_by(Fatal_Y_N) %>%
  count() %>%
  filter(Fatal_Y_N %in% Yes_No) # drops any other outcome group

# Share of each outcome over the total of the kept groups (computed on the
# whole columns at once, so an empty table does not break it).
Fatal$sum <- sum(Fatal$n)
Fatal$Percentage <- Fatal$n / Fatal$sum

# Pie chart of the two shares. Columns are referenced with formulas (~col)
# so they are looked up in the data passed to plot_ly(), without attach()-ing
# the table to the search path.
plot_ly(ungroup(Fatal), labels = ~Fatal_Y_N, values = ~Percentage, type = "pie",
        marker = list(colors = c('yellowgreen', '#DF2B0D'))) %>%
  layout(title = "Fatalità degli attacchi di squalo in percentuale", paper_bgcolor = '#f8f4f4')
# Area and outcome of every attack with a non-empty Area, sorted by area.
# `Fatal` is a copy of `Fatal_Y_N`.
attack <- temp %>%
  select(Area, Fatal_Y_N) %>%
  mutate(Fatal = Fatal_Y_N) %>%
  filter(Area != "") %>% # drop rows with an empty Area field
  arrange(Area)

# Keep only the areas with more than 100 recorded attacks.
overall_tally <- attack %>%
  group_by(Area) %>%
  count() %>%
  filter(n > 100)

# Names of those areas as a one-column tibble (column `value`). Built with
# tibble() (tbl_df() is deprecated) and without dput(), which printed the
# vector to the console even though the result was wrapped in invisible().
names <- tibble(value = unique(overall_tally$Area))

# Attack counts grouped by area and by outcome.
area_attack_fatal <- attack %>%
  group_by(Area, Fatal) %>%
  count()

# Same counts restricted to the selected areas and to the Y / N outcomes.
area_attack_fatal1 <- area_attack_fatal %>%
  filter(Area %in% names$value) %>%
  filter(Fatal %in% Yes_No)
area_attack_fatal1
## # A tibble: 22 x 3
## # Groups:   Area, Fatal [22]
##    Area                  Fatal     n
##    <chr>                 <chr> <int>
##  1 California            N       240
##  2 California            Y        18
##  3 Eastern Cape Province N       125
##  4 Eastern Cape Province Y        22
##  5 Florida               N       911
##  6 Florida               Y        45
##  7 Hawaii                N       211
##  8 Hawaii                Y        46
##  9 KwaZulu-Natal         N       123
## 10 KwaZulu-Natal         Y        43
## # ... with 12 more rows
# Horizontal dodged bars: number of attacks per area, one bar per value of
# `Fatal`.
Attack_Area <- ggplot(data = area_attack_fatal1) +
  geom_bar(
    mapping = aes(x = reorder(Area, n), y = n, fill = Fatal),
    stat = "identity",
    position = "dodge"
  ) +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  scale_fill_manual(values = c("yellowgreen", "#DF2B0D")) +
  labs(
    title = "Numero attacchi squalo per area",
    x = "Area",
    y = "Numero di attacchi",
    fill = "Fatalità"
  ) +
  coord_flip()
ggplotly(Attack_Area)

# NUMBER OF SHARK ATTACKS VS. COUNTRY VS. FATALITY

# Country and outcome of every attack with a Y / N outcome and a non-empty
# Country, sorted by country. `Fatal` is a copy of `Fatal_Y_N`.
attack_by_country <- temp %>%
  select(Country, Fatal_Y_N) %>%
  filter(Fatal_Y_N %in% Yes_No) %>%
  mutate(Fatal = Fatal_Y_N) %>%
  filter(Country != "") %>%
  arrange(Country)

# Country | total number of attacks, for countries with more than 30 attacks.
overall_tally_country <- attack_by_country %>%
  group_by(Country) %>%
  count() %>%
  filter(n > 30)

# Names of those countries as a one-column tibble (column `value`). Built
# with tibble() (tbl_df() is deprecated) and without dput(), which dumped the
# vector to the console.
names_country <- tibble(value = unique(overall_tally_country$Country))

# Attack counts per country and outcome, restricted to the selected countries.
Country_attack_by_country_fatal <- attack_by_country %>%
  group_by(Country, Fatal) %>%
  tally() %>%
  filter(Country %in% names_country$value)

# Add `sum`, the total number of attacks of the row's country. It is computed
# per country rather than by pairing row i with row i + 1, so it stays
# correct when a country has only one of the two outcome rows and never reads
# past the last row.
Country_attack_by_country_fatal1 <- Country_attack_by_country_fatal %>%
  select(Country, Fatal, n) %>%
  group_by(Country) %>%
  mutate(sum = sum(n))

# Survive: share of the row's own outcome within its country (n / sum), so on
# Fatal == "N" rows it is the survival share.
# Death:   on Fatal == "Y" rows the fatal share of the country; 0 on the
#          other rows.
Shark_Fatal_Country <- Country_attack_by_country_fatal1 %>%
  mutate(
    Survive = n / sum,
    Death = if_else(Fatal == "Y", n / sum, 0)
  )

Chance of Dying from a Shark Attack by Country

# Bar chart of the chance of death per country (fatal rows only), highest
# first. The tooltip text carries the number of deaths.
Shark_Fatal_Country_Plot_Die <- Shark_Fatal_Country %>%
  filter(Fatal == "Y") %>%
  # `Death` is stored as a 0-1 fraction, while the title and axis announce a
  # percentage (and the survival plot shows 0-100): convert for display only.
  mutate(Death = Death * 100) %>%
  ggplot() +
  geom_bar(mapping = aes(reorder(Country, -Death), y = Death, fill = Death, text = paste0("Number of Deaths: ", n)), stat = "identity") +
  scale_fill_gradient("Chance", low = "yellow", high = "red") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Percent Chance of Death from Shark Attack (By Country)", x = "Country", y = "Percent Chance of Death from Shark Attack")
ggplotly(Shark_Fatal_Country_Plot_Die)

Chance of Surviving a Shark Attack by Country

# Copy of the per-country table with `Survive` rescaled by a factor of 100.
Shark_Fatal_Country_perc <- Shark_Fatal_Country
Shark_Fatal_Country_perc$Survive <- 100 * Shark_Fatal_Country_perc$Survive

# Horizontal bar chart of `Survive` per country, using the Fatal == "N" rows
# only. The tooltip text carries the row count `n`.
survivor_rows <- filter(Shark_Fatal_Country_perc, Fatal == "N")
Shark_Fatal_Country_Plot_Survive <- ggplot(data = survivor_rows) +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  geom_bar(
    mapping = aes(
      x = reorder(Country, Survive),
      y = Survive,
      fill = Survive,
      text = paste0("Number of Survivals: ", n)
    ),
    stat = "identity"
  ) +
  scale_fill_gradient("Count", low = "red", high = "yellowgreen") +
  labs(
    title = "Percent Chance of Survival By Shark Attack (By Country)",
    x = "Country",
    y = "Survival Rate of Shark Attack (in %)"
  ) +
  coord_flip()
ggplotly(Shark_Fatal_Country_Plot_Survive)
# Horizontal dodged bars: number of attacks per country, one bar per value
# of `Fatal`.
Attack_Country <- ggplot(data = Country_attack_by_country_fatal1) +
  geom_bar(
    mapping = aes(x = reorder(Country, n), y = n, fill = Fatal),
    stat = "identity",
    position = "dodge"
  ) +
  scale_fill_manual(values = c("yellowgreen", "#DF2B0D")) +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  labs(
    title = "Number of Shark Bites: Fatal and Non Fatal",
    x = "Country",
    y = "Number of Shark Bites",
    fill = "Fatal?"
  ) +
  coord_flip()
ggplotly(Attack_Country)

Number of Shark Attacks vs Age of Victims

# Bar chart of the number of attacks per recorded victim age.
Attack_Age <- temp %>%
  group_by(Age) %>%
  count() %>%
  filter(Age != "") %>% # drop the group with an empty Age field
  filter(n > 9) %>% # keep ages with at least 10 attacks
  # NOTE(review): 2568 looks like the size of one specific group to exclude;
  # confirm which group this is meant to drop.
  filter(n != 2568) %>%
  # NOTE(review): these ages are excluded explicitly; reason not visible here.
  filter(Age %!in% c(1:9, 61, 69)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = Age, y = n, fill = n), stat = "identity") +
  scale_fill_gradient("Count", low = "yellow", high = "red") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  labs(
    title = "Number of Shark Attacks vs. Age of Victim",
    x = "Age of Victim",
    y = "Number of Shark Attacks",
    fill = "Count"
  )
ggplotly(Attack_Age)

Number of Fatalities vs. Number of Non Fatalities for Each Activity

# Records with a known outcome (Fatal_Y_N is "Y" or "N").
shark2 <- shark %>% filter(Fatal_Y_N %in% c("Y", "N"))

# The 30 most frequent non-empty activities with their attack counts.
top_30_activity <- shark2 %>%
  group_by(Activity) %>%
  tally() %>%
  arrange(desc(n)) %>%
  filter(Activity != "") %>%
  select(Activity, n) %>%
  slice(1:30)

# Names of those activities as a one-column tibble (column `value`). Built
# with tibble() (tbl_df() is deprecated) and without dput(), which dumped the
# vector to the console.
activities <- tibble(value = unique(top_30_activity$Activity))

# Attack counts per activity and outcome, most frequent first, restricted to
# the selected activities.
Activity_Fatal <- shark2 %>%
  group_by(Activity, Fatal_Y_N) %>%
  count() %>%
  arrange(desc(n)) %>%
  filter(Activity != "") %>%
  filter(Activity %in% activities$value)
# Dodged bars: number of attacks per activity, one bar per value of
# `Fatal_Y_N`, activities ordered by decreasing count.
Activity_Fatal_Plot <- ggplot(data = Activity_Fatal) +
  geom_bar(
    mapping = aes(x = reorder(Activity, -n), y = n, fill = Fatal_Y_N),
    stat = "identity",
    position = "dodge"
  ) +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  scale_fill_manual(values = c("yellowgreen", "#DF2B0D")) +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    title = "Shark Attack Fatalities with Activity",
    x = "Activity",
    y = "Number of Shark Attacks",
    fill = "Fatal"
  )
ggplotly(Activity_Fatal_Plot)

Percent of Fatal Shark Attacks per each Activity

# Split the per-activity counts by outcome.
Yes_Fatal <- Activity_Fatal %>% filter(Fatal_Y_N == "Y")
No_Fatal <- Activity_Fatal %>% filter(Fatal_Y_N == "N")

# One row per activity that has both a fatal and a non-fatal count.
Both_Fatal <- inner_join(Yes_Fatal, No_Fatal, by = "Activity")
names(Both_Fatal) <- c("Activity", "Fatal_Y_N.x", "Number_of_Fatalities", "Fatal_Y_N.y", "Number_of_Non-Fatalities")
Both_Fatal <- Both_Fatal %>%
  remove_missing() %>%
  select(-c(Fatal_Y_N.x, Fatal_Y_N.y))

# Share of fatal attacks per activity: fatal / (fatal + non-fatal), as a 0-1
# fraction. Computed on whole columns, so a table with zero rows is handled
# and no row-by-row cell assignment is needed.
Both_Fatal$Percent_Fatality <- Both_Fatal[["Number_of_Fatalities"]] /
  (Both_Fatal[["Number_of_Fatalities"]] + Both_Fatal[["Number_of_Non-Fatalities"]])


# Bar chart of the share of fatal attacks per activity, highest first.
Bar_Fatality_Percent <- Both_Fatal %>%
  # `Percent_Fatality` is stored as a 0-1 fraction while the axis, legend and
  # title announce a percentage: convert for display only.
  mutate(Percent_Fatality = Percent_Fatality * 100) %>%
  ggplot() +
  geom_bar(mapping = aes(reorder(Activity, -Percent_Fatality), y = Percent_Fatality, fill = Percent_Fatality), stat = "identity") +
  # No explicit scale name here: a scale name takes precedence over labs(),
  # which kept the intended "Percent Fatality" legend title from showing.
  scale_fill_gradient(low = "yellow", high = "red") +
  scale_color_fivethirtyeight() +
  theme_fivethirtyeight() +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(face = "bold", size = 11)
  ) +
  # Single labs() call; the earlier, longer title was always overridden by
  # this one.
  labs(
    title = "Percent Fatality with Shark Attacks in Relation to Activity",
    x = "Activity",
    y = "Fatality Percent",
    fill = "Percent Fatality"
  )
ggplotly(Bar_Fatality_Percent)